################################################################################
# Make the corpus directory the working directory: the file listing and the
# .RData output further down use paths relative to it.
# NOTE(review): hard-coded, machine-specific path - adjust before running
# the script on another machine.
setwd("N:/Empirikom/Daten/")

# TCF einlesen - ANFANG

# Read one TCF file and return its annotation layers as a token table.
#
# The file is processed line by line with string splitting (no XML parser):
# every layer is located via its opening/closing section tag, which must be
# indented by exactly four spaces and spelled exactly as in the patterns
# below (namespace prefix "ns3:" or "tc:"). XML entities in token or lemma
# text are not decoded.
#
# Args:
#   dateiname: Path of the TCF file. The last nine characters of the base
#     name are taken to be a one-character separator, a four-character
#     edition code and a four-character extension (e.g. "_orig.tcf"); the
#     first five characters identify the sub-corpus.
#
# Returns:
#   A data.frame with one row per token and the columns tokID, tok, seg
#   (running sentence number within the file), lemma, pos, govID, dep, text,
#   subkorpus and edition. Annotations that are absent for a token are NA.
#   A layer that cannot be found raises a warning and leaves its columns NA.
tcf.einlesen <- function(dateiname) {
  print(dateiname)

  tcf.datei <- readLines(con = dateiname, encoding = "UTF-8")

  # Lines strictly between the opening and the closing tag of one layer.
  # Returns character(0) for an empty or missing layer instead of building
  # a descending index such as 1:-1 or (k + 1):k.
  abschnitt <- function(start_tags, ende_tags) {
    zeile_finden <- function(tags) {
      treffer <- charmatch(tags, tcf.datei, nomatch = 0)
      treffer <- treffer[treffer > 0]
      if (length(treffer) == 0) NA_integer_ else treffer[1]
    }
    anfang <- zeile_finden(start_tags)
    schluss <- zeile_finden(ende_tags)
    if (is.na(anfang) || is.na(schluss)) {
      warning("Abschnitt ", sub("^ +", "", start_tags[1]),
              " nicht gefunden in ", dateiname, call. = FALSE)
      return(character(0))
    }
    if (schluss - anfang < 2) {
      return(character(0))
    }
    tcf.datei[(anfang + 1):(schluss - 1)]
  }

  # For every line: the text after the first occurrence of `davor`, cut off
  # at the next occurrence of `danach`; NA when `davor` does not occur.
  zwischen <- function(zeilen, davor, danach) {
    vapply(
      strsplit(zeilen, split = davor, fixed = TRUE),
      function(teile) strsplit(teile[2], split = danach, fixed = TRUE)[[1]][1],
      character(1),
      USE.NAMES = FALSE
    )
  }

  # Token IDs and token text -------------------------------------------------

  tcf.tokens <- abschnitt(c("    <ns3:tokens>", "    <tc:tokens>"),
                          c("    </ns3:tokens>", "    </tc:tokens>"))

  # stringsAsFactors is spelled out so that R < 4.0 keeps plain strings too.
  tcf.partitur <- data.frame(tokID = zwischen(tcf.tokens, "ID=\"", "\""),
                             tok = zwischen(tcf.tokens, "\">", "</"),
                             stringsAsFactors = FALSE)
  n_tokens <- nrow(tcf.partitur)

  # Sentence (segment) number per token --------------------------------------

  tcf.segmente <- abschnitt(c("    <ns3:sentences>", "    <tc:sentences>"),
                            c("    </ns3:sentences>", "    </tc:sentences>"))

  # Cutting the attribute value at the closing quote (rather than at "\">")
  # also handles self-closing sentence elements and trailing attributes.
  segmente <- strsplit(zwischen(tcf.segmente, "tokenIDs=\"", "\""),
                       split = " ", fixed = TRUE)

  # Direct lookup token ID -> sentence number. Tokens that belong to no
  # sentence get NA instead of aborting the import.
  seg_nummer <- rep(seq_along(segmente), lengths(segmente))
  tcf.partitur$seg <- as.numeric(
    seg_nummer[match(tcf.partitur$tokID, unlist(segmente),
                     incomparables = NA)]
  )

  # Lemmas and POS tags ------------------------------------------------------

  # Both layers share one layout: the token ID sits in a `...IDs="..."`
  # attribute and the value is the element text. If a token is annotated
  # more than once, the last entry wins.
  zuordnen <- function(zeilen, tok_ids) {
    werte <- rep(NA_character_, length(tok_ids))
    ziel <- match(zwischen(zeilen, "IDs=\"", "\""), tok_ids,
                  incomparables = NA)
    gefunden <- !is.na(ziel)
    werte[ziel[gefunden]] <- zwischen(zeilen, "\">", "</")[gefunden]
    werte
  }

  tcf.lemmas <- abschnitt(c("    <ns3:lemmas>", "    <tc:lemmas>"),
                          c("    </ns3:lemmas>", "    </tc:lemmas>"))
  tcf.partitur$lemma <- zuordnen(tcf.lemmas, tcf.partitur$tokID)

  tcf.postags <- abschnitt(c("    <ns3:POStags tagset=\"STTS\">",
                             "    <tc:POStags tagset=\"STTS\">"),
                           c("    </ns3:POStags>", "    </tc:POStags>"))
  tcf.partitur$pos <- zuordnen(tcf.postags, tcf.partitur$tokID)

  # Governor ID and dependency label -----------------------------------------

  tcf.dependencies <- abschnitt(
    c("    <ns3:depparsing emptytoks=\"true\" multigovs=\"false\" tagset=\"NoSta-D-DEP\">",
      "    <tc:depparsing emptytoks=\"true\" multigovs=\"false\" tagset=\"NoSta-D-DEP\">"),
    c("    </ns3:depparsing>", "    </tc:depparsing>")
  )

  dep_ids <- zwischen(tcf.dependencies, "depIDs=\"", "\"")
  gov_ids <- zwischen(tcf.dependencies, "govIDs=\"", "\"")
  labels <- zwischen(tcf.dependencies, "func=\"", "\"")

  govID <- rep(NA_character_, n_tokens)
  dep <- rep(NA_character_, n_tokens)

  for (k in seq_along(dep_ids)) {
    # Lines without a depIDs attribute (<parse> wrappers, empty-token
    # declarations) carry no dependency.
    if (is.na(dep_ids[k])) {
      next
    }
    ziel <- match(dep_ids[k], tcf.partitur$tokID)
    # Dependents that are not regular tokens (e.g. empty tokens) have no row.
    if (is.na(ziel)) {
      next
    }
    if (!is.na(govID[ziel])) {
      # A second governor for the same token: report it and stop reading
      # further dependencies of this file.
      print(noquote(paste("Mehr als ein Regent f\u00fcr Token",
                          dep_ids[k],
                          "aus Segment",
                          tcf.partitur$seg[ziel],
                          sep = " ")))
      break
    }
    govID[ziel] <- gov_ids[k]
    dep[ziel] <- labels[k]
  }

  tcf.partitur$govID <- govID
  tcf.partitur$dep <- dep

  # Text name, sub-corpus and edition, derived from the file name -------------

  # Only the base name carries the naming scheme; a directory prefix must
  # not leak into the metadata.
  name <- basename(dateiname)

  kuerzel <- substr(name, 1, 5)
  langformen <- c(ansel = "anselm", bemat = "bematac",
                  tueba = "tuebadz", unicu = "unicum")
  if (substr(kuerzel, 1, 3) %in% c("fk0", "cbs")) {
    subkorpus <- "falko"
  } else if (kuerzel %in% names(langformen)) {
    subkorpus <- langformen[[kuerzel]]
  } else {
    subkorpus <- kuerzel
  }

  # rep() keeps the assignments valid for a table without rows.
  tcf.partitur$text <- rep(substr(name, 1, nchar(name) - 9), n_tokens)
  tcf.partitur$subkorpus <- rep(subkorpus, n_tokens)
  tcf.partitur$edition <- rep(substr(name, nchar(name) - 7, nchar(name) - 4),
                              n_tokens)

  tcf.partitur
}

# TCF einlesen - ENDE

# Read every TCF file in the working directory and stack the per-file token
# tables into one corpus table. `pattern` is a regular expression, so the dot
# is escaped and the match anchored at the end of the name; the bare ".tcf"
# would also pick up names such as "xtcf.bak" or "a.tcf.old".
nostad <- do.call(rbind, lapply(list.files(pattern = "\\.tcf$"), tcf.einlesen))
nostad$subkorpus <- as.factor(nostad$subkorpus)
nostad$edition <- as.factor(nostad$edition)

# The reader is only needed for the import; drop it from the workspace.
rm(tcf.einlesen)

# Save the corpus as .RData

save(nostad, file = "NoSta-D-1.4.RData")

# Segment-Ausgabe-Funktion

# Print one segment of a text in both editions.
#
# Reads the global corpus table `nostad` and prints three lines: the text
# name together with the segment number, the tokens of the "orig" edition
# and the tokens of the "norm" edition, each joined by single spaces.
#
# Args:
#   Text:    value of `nostad$text` identifying the text.
#   Segment: value of `nostad$seg` identifying the segment within that text.
#
# Returns:
#   Invisibly, the joined token string of the "norm" edition (the value of
#   the last print() call).
segment_ausgeben <- function(Text, Segment) {
  print(paste(Text, Segment, sep = " "))

  # which() keeps only rows where the comparison is TRUE. Indexing with the
  # raw logical vector would turn rows with a missing segment number into NA
  # elements, which paste() then renders as the literal token "NA".
  tokens_der_edition <- function(edition) {
    treffer <- which(nostad$text == Text &
                       nostad$edition == edition &
                       nostad$seg == Segment)
    paste(nostad$tok[treffer], collapse = " ")
  }

  print(tokens_der_edition("orig"))
  print(tokens_der_edition("norm"))
}